Trends in Data Science & Business Analytics
  • Home
  • Data Cleaning & Exploration
    • Data Cleaning
    • Exploratory Data Analysis
    • Skill Gap Analysis
  • Machine Learning Methods
    • Supervised Machine Learning
    • Unsupervised Machine Learning

Exploratory Data Analytics

Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")
Code
# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
            'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']

match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)

eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') 
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False    38212
True     33042
Name: count, dtype: int64
Code
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

short_names = {
    'Professional, Scientific, and Technical Services': 'Tech. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})

pivot = (
    df_grouped
    .pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
    .reset_index()
)
industries = pivot['Industry'].tolist()
y_true  = pivot['True'].tolist()
y_false = pivot['False'].tolist()


# 2) Build a 2-row subplot: bar on top, table below

fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.70, 0.30],           # give a bit more room to the table
    specs=[[{"type":"bar"}],[{"type":"table"}]],
    vertical_spacing=0.12              # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

fig.add_trace(
    go.Bar(
        x=industries, y=y_true, name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true, textposition='outside'
    ),
    row=1, col=1
)
fig.add_trace(
    go.Bar(
        x=industries, y=y_false, name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false, textposition='outside'
    ),
    row=1, col=1
)



# 3) Slider steps: 0 → 8 000 in 200s

steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {"y": [
                [v if v>=val else 0 for v in y_true],
                [v if v>=val else 0 for v in y_false]
            ]},
            {"title": f"Min Jobs ≥ {val:,}"}
        ]
    ))


# 4) Final layout tweaks

fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix":"Min Jobs: "},
        pad={"b":0},
        x=0.05,
        y=1.05,                # move slider way above the plot area
        xanchor="left",
        yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps
    )],

    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5,
        y=0.95,                # drop the title just below the slider
        xanchor="center",
        yanchor="top"
    ),

    width=1100, height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin

    plot_bgcolor='white',
    paper_bgcolor='white',

    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True, linecolor='#A81D1D'
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True, linecolor='#A81D1D',
        range=[0, max(max(y_true),max(y_false))*1.2]
    ),

    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=0.95, y=0.95
    ),

    bargap=0.2
)

fig.write_html(
    "figures/edaplot1.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',')  # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()

# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')

# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]

# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)

# Step 6: Create the bar plot
fig = px.bar(
    df_skills_top,
    x='Count',
    y='SPECIALIZED_SKILLS_NAME',
    color='Job_Category',
    barmode='stack',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
    title='Top 10 Specialized Skills by Job Category'
)

# Step 7: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    xaxis=dict(
        title='Number of Jobs',
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor='#E2E8F0',
        linecolor='#2D3748',
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title='Specialized Skill',
        title_font=dict(size=16),
        tickfont=dict(size=12)
    ),
    legend=dict(
        title='Job Category',
        font=dict(size=13),
        bgcolor='#FFFFFF',
        bordercolor='#FF6B6B',
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle'
    )
)



# Save to HTML
fig.write_html(
    'figures/edaplot2.html',
    include_plotlyjs='cdn',
    full_html=False
)
Number of rows after cleaning: 71254
Top 10 specialized skills:
               SPECIALIZED_SKILLS_NAME       Job_Category  Count
2071           "Business Intelligence"      Analytics Job   8077
2072           "Business Intelligence"  Non-Analytics Job   1778
2167                "Business Process"      Analytics Job   4417
2168                "Business Process"  Non-Analytics Job   8385
2180           "Business Requirements"      Analytics Job   4972
2181           "Business Requirements"  Non-Analytics Job   7740
4151                       "Dashboard"      Analytics Job   9975
4152                       "Dashboard"  Non-Analytics Job   1559
4173                   "Data Analysis"      Analytics Job  25620
6640                         "Finance"      Analytics Job   5573
6641                         "Finance"  Non-Analytics Job   6220
13332             "Project Management"      Analytics Job   6236
13333             "Project Management"  Non-Analytics Job   6939
13594  "Python (Programming Language)"      Analytics Job  10127
13595  "Python (Programming Language)"  Non-Analytics Job   1703
14459               "SAP Applications"      Analytics Job   2174
14460               "SAP Applications"  Non-Analytics Job   9149
14686     "SQL (Programming Language)"      Analytics Job  16067
14687     "SQL (Programming Language)"  Non-Analytics Job   4202
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Step 2: Clean the data (remove rows with missing values for the variables we need)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'REMOTE_TYPE_NAME', 'MIN_EDULEVELS_NAME'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 4: Encode categorical variables numerically for the parallel coordinates plot
# Encode REMOTE_TYPE_NAME (e.g., Remote=2, Hybrid=1, On-site=0)
remote_mapping = {'Remote': 2, 'Hybrid': 1, 'On-site': 0}  # Adjust based on your unique values
df['Remote_Type_Code'] = df['REMOTE_TYPE_NAME'].map(remote_mapping)

# Encode MIN_EDULEVELS_NAME (e.g., High School=0, Bachelor's=1, Master's=2, PhD=3)
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3}  # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)

# Step 5: Clean again after encoding (in case some mappings failed)
df = df.dropna(subset=['Remote_Type_Code', 'Edu_Level_Code'])

# Debug: Check the encoded data
print("Sample encoded data:")
print(df[['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code', 'Job_Category']].head())

# Step 6: Create the parallel coordinates plot
fig = px.parallel_coordinates(
    df,
    dimensions=['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code'],
    color_continuous_scale=['#4ECDC4', '#FF6B6B'],  # Gradient from teal to red
    color=df['Job_Category'].map({'Analytics Job': 1, 'Non-Analytics Job': 0}),  # Color by Job_Category
    labels={
        'SALARY': 'Salary ($)',
        'Avg_Years_Experience': 'Avg. Years Experience',
        'Remote_Type_Code': 'Remote Type (0=On-site, 1=Hybrid, 2=Remote)',
        'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
    },
    title='Multidimensional Comparison of Job Attributes by Job Category'
)

# Step 7: Update layout for styling
fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    )
)



fig.write_html(
    "figures/edaplot3.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Number of rows after cleaning: 71254
Sample encoded data:
Empty DataFrame
Columns: [SALARY, Avg_Years_Experience, Remote_Type_Code, Edu_Level_Code, Job_Category]
Index: []
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

# Create the scatter plot with trend line
fig = px.scatter(df, 
                 x='Avg_Years_Experience', 
                 y='SALARY', 
                 color='Job_Category',
                 trendline='ols',  # Add trend line (ordinary least squares)
                 title='Experience Requirements vs Salary for Analytics Jobs',
                 labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
                 color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Average Years of Experience",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

# Customize scatter points
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(width=1, color="#2D3748")
    )
)


fig.write_html(
    "figures/edaplot4.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Step 2: Clean the data (remove rows with missing values)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'MIN_EDULEVELS_NAME', 'POSTED'])

# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))

# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 4: Encode MIN_EDULEVELS_NAME numerically
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3}  # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)

# Step 5: Clean again after encoding
df = df.dropna(subset=['Edu_Level_Code'])

# Step 6: Extract a time component from POSTED (e.g., year or year-month)
df['POSTED'] = pd.to_datetime(df['POSTED'])  # Ensure POSTED is in datetime format
df['Year_Month'] = df['POSTED'].dt.to_period('M').astype(str)  # Extract year-month for animation

# Debug: Check the time range
print("Time range for animation:", df['Year_Month'].unique())

# Step 7: Create the 3D scatter plot with animation
fig = px.scatter_3d(
    df,
    x='Avg_Years_Experience',
    y='SALARY',
    z='Edu_Level_Code',
    color='Job_Category',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
    animation_frame='Year_Month',  # Animate over time
    title='Salary, Experience, and Education Over Time by Job Category',
    labels={
        'Avg_Years_Experience': 'Avg. Years Experience',
        'SALARY': 'Salary ($)',
        'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
    },
    hover_data=['Job_Category', 'Year_Month']
)

# Step 8: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    scene=dict(
        xaxis_title='Avg. Years Experience',
        yaxis_title='Salary ($)',
        zaxis_title='Min. Education Level',
        xaxis=dict(tickfont=dict(size=12)),
        yaxis=dict(tickfont=dict(size=12)),
        zaxis=dict(tickfont=dict(size=12))
    ),
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    legend=dict(
        title='Job Category',
        font=dict(size=13),
        bgcolor='#FFFFFF',
        bordercolor='#FF6B6B',
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor='left',
        yanchor='middle'
    )
)


# Save the file
fig.write_html(
    "figures/edaplot5.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Number of rows after cleaning: 71254
Time range for animation: []
Code
import plotly.express as px
import pandas as pd
import numpy as np

# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()

# Step 1: Clean the data (remove rows with missing STATE_NAME)
df = df.dropna(subset=['STATE_NAME'])

# Step 2: Handle any anomalies in STATE_NAME (e.g., "[Unknown City], AR" should not affect STATE_NAME)
# Since STATE_NAME is already provided (e.g., "Arkansas"), we can proceed directly

# Debug: Check the number of rows after cleaning and unique states
print("Number of rows after cleaning:", len(df))
print("Unique states extracted:", df['STATE_NAME'].unique())

# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Step 4: Aggregate data by state and job category
df_state_counts = df.groupby(['STATE_NAME', 'Job_Category']).size().reset_index(name='Job_Count')

# Step 5: Pivot the data to get counts for Analytics and Non-Analytics Jobs
df_pivot = df_state_counts.pivot(index='STATE_NAME', columns='Job_Category', values='Job_Count').fillna(0)
df_pivot['Total_Jobs'] = df_pivot.get('Analytics Job', 0) + df_pivot.get('Non-Analytics Job', 0)
df_pivot = df_pivot.reset_index()

# Debug: Check the aggregated data
print("Aggregated data by state:")
print(df_pivot)

# Step 6: Map state names to state codes for Plotly choropleth
state_name_to_code = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

df_pivot['State_Code'] = df_pivot['STATE_NAME'].map(state_name_to_code)

# Step 7: Clean the data (remove rows with unmapped states)
df_pivot = df_pivot.dropna(subset=['State_Code'])

# Step 8: Apply a logarithmic scale to Total_Jobs for better visualization (avoiding zero values)
df_pivot['Log_Total_Jobs'] = np.log1p(df_pivot['Total_Jobs'])  # log1p handles zero values (log(1+x))

# Debug: Check the final data before plotting
print("Final data for plotting:")
print(df_pivot[['STATE_NAME', 'State_Code', 'Total_Jobs', 'Log_Total_Jobs']])

# Step 9: Create the choropleth map with a logarithmic color scale
fig = px.choropleth(
    df_pivot,
    locations='State_Code',
    locationmode='USA-states',
    color='Log_Total_Jobs',
    color_continuous_scale='Reds',
    scope='usa',
    title='Distribution of Jobs Across U.S. States (Log Scale)',
    hover_data=['STATE_NAME', 'Analytics Job', 'Non-Analytics Job', 'Total_Jobs'],
    labels={'Log_Total_Jobs': 'Log(Number of Jobs)', 'Total_Jobs': 'Number of Jobs'}
)

# Step 10: Update layout for styling
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
    title=dict(
        font=dict(size=24, color='#FF6B6B'),
        x=0.5,
        xanchor='center',
        y=0.95,
        yanchor='top'
    ),
    geo=dict(
        bgcolor='white',
        lakecolor='white',
        landcolor='lightgray',
        subunitcolor='black',
        showlakes=True,
        showsubunits=True
    ),
    coloraxis_colorbar=dict(
        title='Log(Number of Jobs)',
        title_font=dict(size=13),
        tickfont=dict(size=12),
        tickvals=[np.log1p(x) for x in [0, 10, 50, 100, 500, 1000]],
        ticktext=['0', '10', '50', '100', '500', '1000']
    )
)

# Step 11: Save to HTML
fig.write_html(
    'figures/edaplot6.html',
    include_plotlyjs='cdn',
    full_html=False
)
Number of rows after cleaning: 71254
Unique states extracted: ['Arkansas' 'Maine' 'Texas' 'Arizona' 'California' 'Ohio' 'New Jersey'
 'New York' 'Hawaii' 'Georgia' 'Michigan' 'Mississippi' 'Massachusetts'
 'Alaska' 'Alabama' 'Indiana' 'Virginia' 'South Carolina' 'Colorado'
 'Nevada' 'Minnesota' 'Oregon' 'Oklahoma' 'North Carolina' 'Florida'
 'Washington' 'Delaware' 'Illinois' 'Pennsylvania' 'Kansas' 'Tennessee'
 'Washington, D.C. (District of Columbia)' 'Maryland' 'Idaho' 'Louisiana'
 'Connecticut' 'Nebraska' 'Missouri' 'North Dakota' 'Utah' 'New Hampshire'
 'Wisconsin' 'Kentucky' 'Rhode Island' 'Iowa' 'South Dakota' 'Montana'
 'New Mexico' 'Wyoming' 'West Virginia' 'Vermont']
Aggregated data by state:
Job_Category                               STATE_NAME  Analytics Job  \
0                                             Alabama            376   
1                                              Alaska            133   
2                                             Arizona            673   
3                                            Arkansas            252   
4                                          California           3239   
5                                            Colorado            646   
6                                         Connecticut            390   
7                                            Delaware            219   
8                                             Florida           1526   
9                                             Georgia           1144   
10                                             Hawaii            140   
11                                              Idaho            214   
12                                           Illinois           1595   
13                                            Indiana            442   
14                                               Iowa            285   
15                                             Kansas            330   
16                                           Kentucky            279   
17                                          Louisiana            230   
18                                              Maine            193   
19                                           Maryland            791   
20                                      Massachusetts            987   
21                                           Michigan            709   
22                                          Minnesota            709   
23                                        Mississippi            267   
24                                           Missouri            587   
25                                            Montana            105   
26                                           Nebraska            221   
27                                             Nevada            237   
28                                      New Hampshire            134   
29                                         New Jersey           1022   
30                                         New Mexico            163   
31                                           New York           1752   
32                                     North Carolina           1284   
33                                       North Dakota             96   
34                                               Ohio           1149   
35                                           Oklahoma            280   
36                                             Oregon            468   
37                                       Pennsylvania            962   
38                                       Rhode Island            253   
39                                     South Carolina            333   
40                                       South Dakota            149   
41                                          Tennessee            599   
42                                              Texas           2967   
43                                               Utah            350   
44                                            Vermont            118   
45                                           Virginia           1906   
46                                         Washington            856   
47            Washington, D.C. (District of Columbia)            668   
48                                      West Virginia             76   
49                                          Wisconsin            477   
50                                            Wyoming             61   

Job_Category  Non-Analytics Job  Total_Jobs  
0                           282         658  
1                           101         234  
2                           932        1605  
3                           299         551  
4                          3813        7052  
5                           786        1432  
6                           447         837  
7                           214         433  
8                          2078        3604  
9                          1481        2625  
10                           97         237  
11                          232         446  
12                         1912        3507  
13                          487         929  
14                          307         592  
15                          382         712  
16                          325         604  
17                          198         428  
18                          149         342  
19                          548        1339  
20                         1027        2014  
21                         1097        1806  
22                          740        1449  
23                          172         439  
24                          611        1198  
25                           78         183  
26                          291         512  
27                          319         556  
28                          133         267  
29                         1569        2591  
30                           91         254  
31                         1579        3331  
32                         1426        2710  
33                           50         146  
34                         1442        2591  
35                          298         578  
36                          593        1061  
37                         1284        2246  
38                          176         429  
39                          288         621  
40                          139         288  
41                          643        1242  
42                         5083        8050  
43                          259         609  
44                          108         226  
45                         1685        3591  
46                          744        1600  
47                          551        1219  
48                           80         156  
49                          544        1021  
50                           42         103  
Final data for plotting:
Job_Category      STATE_NAME State_Code  Total_Jobs  Log_Total_Jobs
0                    Alabama         AL         658        6.490724
1                     Alaska         AK         234        5.459586
2                    Arizona         AZ        1605        7.381502
3                   Arkansas         AR         551        6.313548
4                 California         CA        7052        8.861208
5                   Colorado         CO        1432        7.267525
6                Connecticut         CT         837        6.731018
7                   Delaware         DE         433        6.073045
8                    Florida         FL        3604        8.190077
9                    Georgia         GA        2625        7.873217
10                    Hawaii         HI         237        5.472271
11                     Idaho         ID         446        6.102559
12                  Illinois         IL        3507        8.162801
13                   Indiana         IN         929        6.835185
14                      Iowa         IA         592        6.385194
15                    Kansas         KS         712        6.569481
16                  Kentucky         KY         604        6.405228
17                 Louisiana         LA         428        6.061457
18                     Maine         ME         342        5.837730
19                  Maryland         MD        1339        7.200425
20             Massachusetts         MA        2014        7.608374
21                  Michigan         MI        1806        7.499423
22                 Minnesota         MN        1449        7.279319
23               Mississippi         MS         439        6.086775
24                  Missouri         MO        1198        7.089243
25                   Montana         MT         183        5.214936
26                  Nebraska         NE         512        6.240276
27                    Nevada         NV         556        6.322565
28             New Hampshire         NH         267        5.590987
29                New Jersey         NJ        2591        7.860185
30                New Mexico         NM         254        5.541264
31                  New York         NY        3331        8.111328
32            North Carolina         NC        2710        7.905073
33              North Dakota         ND         146        4.990433
34                      Ohio         OH        2591        7.860185
35                  Oklahoma         OK         578        6.361302
36                    Oregon         OR        1061        6.967909
37              Pennsylvania         PA        2246        7.717351
38              Rhode Island         RI         429        6.063785
39            South Carolina         SC         621        6.432940
40              South Dakota         SD         288        5.666427
41                 Tennessee         TN        1242        7.125283
42                     Texas         TX        8050        8.993552
43                      Utah         UT         609        6.413459
44                   Vermont         VT         226        5.424950
45                  Virginia         VA        3591        8.186464
46                Washington         WA        1600        7.378384
48             West Virginia         WV         156        5.056246
49                 Wisconsin         WI        1021        6.929517
50                   Wyoming         WY         103        4.644391